The purpose of the case study is to classify a given silhouette as one of four different types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
Four "Corgie" model vehicles were used for the experiment: a double decker bus, a Chevrolet van, and two cars — a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars.
The purpose is to classify a given silhouette as one of four types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
Attribute Information:
Where sigma_maj^2 is the variance along the major axis and sigma_min^2 is the variance along the minor axis, and area of hollows = area of bounding poly-area of object
# To enable plotting graphs in Jupyter notebook
%matplotlib inline
# Numerical libraries
import numpy as np
from sklearn.model_selection import train_test_split
# Import the decision tree regressor (NOTE: imported but unused in this notebook)
from sklearn.tree import DecisionTreeRegressor
# to handle data in form of rows and columns
import pandas as pd
# importing plotting libraries
import matplotlib.pyplot as plt
# importing seaborn for statistical plots
import seaborn as sns
# importing the Support Vector Classifier
from sklearn.svm import SVC
# importing GridSearchCV for hyper-parameter tuning
from sklearn.model_selection import GridSearchCV
# importing k-fold cross validation helpers
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Feature Engineering: standardization and dimensionality reduction
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# metrics
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score, auc
from sklearn.utils import resample
from sklearn import preprocessing
# Fixed seed so the notebook's stochastic steps are reproducible
random_state = 42
np.random.seed(random_state)
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')
We read the csv file given into a dataframe
# Load the vehicle silhouette dataset into a dataframe
data = pd.read_csv("vehicle-1.csv")
print('The shape of the data is',data.shape)
# Peek at the first/last rows and the dtypes / non-null counts
data.head()
data.tail()
data.info()
As we can see there are some missing values in our dataset and they have to be handled before machine learning algorithms are applied
# Check for any Non-Real value present in the dataset such as '?' or '*' etc.
# (shows rows where any feature column — 'class' excluded — fails np.isreal)
data[~data.iloc[:,:-1].applymap(np.isreal).all(1)]
# Count of missing values per column
data.isnull().sum()
Observation:
# Missing-value handling: impute every numeric column with its median.
nulls = data.isnull().sum()
nulls[nulls > 0]
# numeric_only=True — the string 'class' column has no median; without it,
# DataFrame.median raises a TypeError on pandas >= 2.0 (older pandas only warned).
data.fillna(data.median(numeric_only=True), inplace=True)
# Verify that no missing values remain
data.isnull().sum()
# Horizontal boxplots of the 18 feature columns — a quick visual scan
# for columns containing outliers.
fig = plt.figure(figsize=(15, 10))
ax = sns.boxplot(data=data.iloc[:, :18], orient='h')
As we can see, there are outliers
# Count outliers per column with the 1.5*IQR (Tukey fence) rule.
# Restricted to the numeric feature columns: including the string 'class'
# column in the comparison raises a TypeError on modern pandas.
feature_cols = data.columns[:-1]
Q1 = data[feature_cols].quantile(0.25)
Q3 = data[feature_cols].quantile(0.75)
IQR = Q3 - Q1
((data[feature_cols] < (Q1 - 1.5 * IQR)) | (data[feature_cols] > (Q3 + 1.5 * IQR))).sum()
# Drop outlier rows column by column (strict inequalities, as in the original)
for col in feature_cols:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    low = q1 - 1.5 * (q3 - q1)
    high = q3 + 1.5 * (q3 - q1)
    data = data[(data[col] < high) & (data[col] > low)]
# Boxplot after removing outliers
# A quick check to find columns that contain outliers
fig = plt.figure(figsize = (15, 10))
ax = sns.boxplot(data = data.iloc[:, 0:18], orient = 'h')
# Row count drops after the outlier removal above; dtypes are unchanged
data.info()
# Five point summary
data.describe().T
We perform univariate analysis to know the distribution of independent variables.
# Univariate analysis: 5-point summary plus a distribution plot per feature.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14 —
# histplot(kde=True) is the supported equivalent.
for i in data.columns:
    if i != 'class':
        print("5 Point Summary of", i, "column is:")
        print(data[i].describe().T)
        ax = sns.histplot(data[i], kde=True, color="m")
        plt.tight_layout()
        plt.show()
        print("*"*100)
        print(" "*200)
Observation:
Bivariate analysis is the analysis of two variables where two variables are analysed to explore the relationship/association between them
Since All of our data is numerical, we can infer results from correlation coefficient.
# Pairwise relationships colored by vehicle class; KDE on the diagonal
sns.pairplot(data, diag_kind='kde', hue = 'class') # to plot density curve instead of histogram
Observation:
# Visualize the correlation among independent features
from matplotlib import pyplot as plt
plt.figure(figsize=(18,18))
# numeric_only=True — DataFrame.corr raises on the string 'class' column
# under pandas >= 2.0 (older pandas silently dropped it).
ax = sns.heatmap(data.corr(numeric_only=True), vmax=.6, square=True, fmt='.2f', annot=True, linecolor='white', linewidths=0.4)
plt.title('Correlation heatmap')
plt.show()
# Total compactness per class, shown as a sorted bar chart
data.groupby(by=['class'])['compactness'].sum().reset_index().sort_values(['compactness']).tail(10).plot(x='class',y='compactness',kind='bar',figsize=(15,5))
plt.show()
As we can see in the below plots, the joint plot gives a straight line parallel to the x axis, which indicates a weak relation between those two attributes
# Joint regression plots against hollows_ratio and related pairs.
# seaborn >= 0.12 requires keyword x/y/data arguments, and the `size`
# parameter was renamed to `height` in 0.9.
for i in ['pr.axis_rectangularity', 'max.length_rectangularity',
          'scaled_variance', 'scaled_variance.1']:
    sns.jointplot(x=i, y='hollows_ratio', data=data, kind='reg', height=7, color='r')
sns.jointplot(x='skewness_about', y='hollows_ratio', data=data, kind='reg', height=7, color='r')
sns.jointplot(x='skewness_about', y='skewness_about.1', data=data, kind='reg', height=7, color='r')
sns.jointplot(x='max.length_rectangularity', y='skewness_about.1', data=data, kind='reg', height=7, color='r')
sns.jointplot(x='scaled_radius_of_gyration.1', y='scatter_ratio', data=data, kind='reg', height=7, color='r')
for i in ['skewness_about.2', 'hollows_ratio']:
    sns.jointplot(x=i, y='scaled_radius_of_gyration.1', data=data, kind='reg', height=7, color='r')
Observation:
# Joint regression plots of several features against elongatedness.
# Keyword x/y/data and height= are required by seaborn >= 0.12.
for i in ['compactness', 'circularity', 'distance_circularity', 'radius_ratio','pr.axis_rectangularity', 'max.length_rectangularity',
          'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration']:
    sns.jointplot(x=i, y='elongatedness', data=data, kind='reg', height=7, color='r')
Observation:
# Joint regression plots against scaled_radius_of_gyration.1
# (keyword x/y/data and height= required by seaborn >= 0.12).
for i in ['skewness_about.2', 'hollows_ratio']:
    sns.jointplot(x=i, y='scaled_radius_of_gyration.1', data=data, kind='reg', height=7, color='r')
Observation:
# Per-feature box and violin plots split by vehicle class
for col in data.columns:
    if col == 'class':
        continue
    plt.figure(figsize=(10,4))
    plt.title(col+' and class Boxplot')
    sns.boxplot(x='class', y=col, data=data,hue='class')
    plt.show()
    plt.figure(figsize=(10,4))
    plt.title(col+' and class Boxplot')
    sns.violinplot(x='class', y=col, data=data,hue='class')
    plt.show()
Observation:
# Absolute and percentage distribution of the target classes
data['class'].value_counts()
data['class'].value_counts(normalize = True)*100
# import the pyplot library
import matplotlib.pyplot as plotter
# Class distribution pie chart.
# BUG FIX: the slice shares were hard-coded (410/801, 203/801, 188/801) and
# silently go stale once rows are dropped during cleaning — derive the
# labels and shares from the data itself instead.
class_counts = data['class'].value_counts()
pieLabels = list(class_counts.index)
populationShare = list(class_counts / class_counts.sum())
figureObject, axesObject = plotter.subplots()
# Draw the pie chart
axesObject.pie(populationShare,labels=pieLabels,autopct='%1.2f',startangle=90)
# Aspect ratio - equal means pie is a circle
axesObject.axis('equal')
plotter.show()
As we can see, the car:van:bus is almost equal to 2:1:1 and we can treat it as a balanced dataset and hence there is no need of balancing
We can get feature importances using Random Forest
Let us apply random forest and get feature importance so that we can decide whether to keep or delete a particular attribute
#import random forest
from sklearn.ensemble import RandomForestClassifier
# Divide the dataset into Input features and Target variables
a = data.drop('class', axis=1)
b = data['class']
# random_state pins the forest so the reported importances are reproducible
# across notebook runs (previously left unseeded).
rf = RandomForestClassifier(random_state=random_state).fit(a, b)
f_p = rf.feature_importances_
# Pair each feature name with its importance expressed as a percentage
res = list(zip(a.columns, f_p*100))
for pair in res:
    print(pair)
Observation:
# Separating X and y from the data dataframe
X = data.drop(['class'], axis = 1)
y = data['class']
# splitting into train and test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# BUG FIX: the scaler was previously fit separately on the test split,
# which leaks test-set statistics and scales the two splits inconsistently.
# Fit on the training data only and reuse that transform for the test data.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Hyper-parameter search space for the SVM on the raw (scaled) features
param_grid = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000, 10000],
    'gamma': [10000, 1000, 100, 10, 1, 0.5, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear', 'poly'],
}
grid = GridSearchCV(SVC(), param_grid, cv=5, verbose=0)
# Run the exhaustive grid search on the training data
best_model = grid.fit(x_train, y_train)
# Pull the winning hyper-parameters out of the refit estimator once
winning = best_model.best_estimator_.get_params()
print('Best gamma:', winning['gamma'])
print('Best C:', winning['C'])
print('Best kernel:', winning['kernel'])
best_gamma = winning['gamma']
best_c = winning['C']
best_kernel = winning['kernel']
print("Best Score:", best_model.best_score_)
# Refit a fresh SVM with the winning configuration and score it on the test set
svm = SVC(C=best_c, gamma=best_gamma, kernel=best_kernel)
svm.fit(x_train, y_train)
predicted_labels = svm.predict(x_test)
print("Accuracy: ", svm.score(x_test, y_test))
print(" "*100)
print(" "*100)
#Confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score
print(confusion_matrix(y_test, predicted_labels))
print("Accuracy Score:",accuracy_score(y_test, predicted_labels))
# BUG FIX: classification_report maps target_names onto the *sorted* unique
# labels (alphabetically: bus, car, van), so the hard-coded
# ['car', 'bus', 'van'] mislabeled every row of the report.
target_names = sorted(y_test.unique())
print(" "*100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
Observation:
# we run kfold cross validation with the default parameters
num_folds = 10
seed = 42
# BUG FIX: shuffle=True is required when random_state is given —
# sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = SVC()
model.fit(x_train, y_train)
# NOTE(review): cross_val_score is run on the *test* split only, mirroring
# the original notebook; consider cross-validating on the training data.
results = cross_val_score(model, x_test, y_test, cv=kfold)
print(results)
print("max accuracy:", results.max())
print("min accuracy:", results.min())
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# we run kfold cross validation with the best parameters found before
num_folds = 10
seed = 42
# BUG FIX: shuffle=True is required when random_state is given —
# sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = SVC(C=best_c,gamma=best_gamma,kernel=best_kernel)
model.fit(x_train, y_train)
results = cross_val_score(model, x_test, y_test, cv=kfold)
print(results)
print("max accuracy:", results.max())
print("min accuracy:", results.min())
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
from scipy import stats
# Standardize the full feature matrix (z-score) before PCA
Xscaled = stats.zscore(X)
# Keep all 18 components so the full variance spectrum can be inspected
pca = PCA(n_components=18)
pca.fit(Xscaled)
eig_vals, eig_vecs = pca.explained_variance_, pca.components_
Eigen Values
# Eigenvalues: variance captured along each principal component
print(pca.explained_variance_)
Eigen Vectors
# Eigenvectors: the principal component directions (loadings)
print(pca.components_)
And the percentage of variation explained by each eigen Vector
# Fraction of total variance explained by each component
print(pca.explained_variance_ratio_)
# Percentage of total variance carried by each component (descending),
# plus the running cumulative total.
total_var = sum(eig_vals)
var_exp = [100 * ev / total_var for ev in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
count = 0
sum = 0
for i in var_exp:
if sum<95:
sum = sum + i
count = count + 1
print("no fo components req to cover 95 % variance:", count)
# Scree bar chart of per-component variance, then the cumulative step plot
component_ids = list(range(1, 19))
plt.bar(component_ids, var_exp, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
plt.step(component_ids, cum_var_exp, where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Combined figure: individual variance bars overlaid with the cumulative curve
plt.figure(figsize = (10,5))
component_ids = range(1, eig_vals.size + 1)
plt.bar(component_ids, var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(component_ids, cum_var_exp, where = 'mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
Now 7 dimensions seems better compared to 18 dimensions. With 7 variables we can explain over 95% of the variation in the original data!
# Re-load the raw dataset and repeat the cleaning pipeline on it.
data_new = pd.read_csv("vehicle-1.csv")
# BUG FIX: the null check and median imputation previously operated on the
# already-cleaned `data` frame instead of the freshly loaded `data_new`.
nulls = data_new.isnull().sum()
nulls[nulls > 0]
data_new.fillna(data_new.median(numeric_only=True), inplace=True)
# IQR-based outlier removal, column by column (feature columns only)
for col in data_new.columns[:-1]:
    q1 = data_new[col].quantile(0.25)
    q3 = data_new[col].quantile(0.75)
    low = q1 - 1.5 * (q3 - q1)
    high = q3 + 1.5 * (q3 - q1)
    data_new = data_new[(data_new[col] < high) & (data_new[col] > low)]
# Separating X and y from the cleaned dataframe.
# BUG FIX: this section previously split `data` even though `data_new` had
# just been rebuilt for exactly this purpose (leaving it unused).
X = data_new.drop(['class'], axis = 1)
y = data_new['class']
x_train_pca, x_test_pca, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Fit the scaler on the training split only and reuse it for the test split
# (previously each split was standardized independently — leakage/inconsistency).
scaler = StandardScaler().fit(x_train_pca)
x_train_pca = scaler.transform(x_train_pca)
x_test_pca = scaler.transform(x_test_pca)
# Reduce to the 7 components that cover ~95% of the variance
pca7 = PCA(n_components=7)
pca7.fit(x_train_pca)
x_train_pca = pca7.transform(x_train_pca)
x_test_pca = pca7.transform(x_test_pca)
print("Eigen vectors: ",pca7.components_)
print("Eigen Values: ",pca7.explained_variance_ratio_)
# Project the full feature matrix with the SAME scaler the PCA was fit on
# (previously z-scored independently via stats.zscore — inconsistent scaling).
XScaled = scaler.transform(X)
Xpca7 = pca7.transform(XScaled)
sns.pairplot(pd.DataFrame(Xpca7))
sns.pairplot(pd.DataFrame(Xpca7),diag_kind='kde')
Observation:
# Hyper-parameter search for the SVM on the PCA-transformed features
param_grid = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000],
    'gamma': [1000, 100, 10, 1, 0.5, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear', 'poly'],
}
from sklearn.model_selection import StratifiedKFold
# Stratified folds keep the class proportions stable in each split
cv = StratifiedKFold(n_splits=3)
grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=0)
best_model_pca = grid.fit(x_train_pca, y_train)
# Extract the winning hyper-parameters from the refit estimator once
winning = best_model_pca.best_estimator_.get_params()
print('Best gamma:', winning['gamma'])
print('Best C:', winning['C'])
print('Best kernel:', winning['kernel'])
best_gamma_pca = winning['gamma']
best_c_pca = winning['C']
best_kernel_pca = winning['kernel']
print("Best Score:", best_model_pca.best_score_)
# Refit with the winning configuration and score on the PCA test split
svm_pca = SVC(C=best_c_pca, gamma=best_gamma_pca, kernel=best_kernel_pca)
svm_pca.fit(x_train_pca, y_train)
predicted_labels = svm_pca.predict(x_test_pca)
print("Accuracy: ",svm_pca.score(x_test_pca, y_test))
print(" "*100)
#Confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score
print(confusion_matrix(y_test, predicted_labels))
print("Accuracy Score:",accuracy_score(y_test, predicted_labels))
# BUG FIX: classification_report maps target_names onto the *sorted* unique
# labels (bus, car, van), so the hard-coded ['car', 'bus', 'van'] mislabeled
# every row of the report.
target_names = sorted(y_test.unique())
print(" "*100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# we run kfold cross validation with the default parameters
num_folds = 20
seed = 42
# BUG FIX: shuffle=True is required when random_state is given —
# sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = SVC()
model.fit(x_train_pca,y_train)
# NOTE(review): cross-validated on the *test* split only, as in the original
results = cross_val_score(model, x_test_pca, y_test, cv=kfold)
print(results)
print("max accuracy:", results.max())
print("min accuracy:", results.min())
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# we run kfold cross validation with the best parameters found before
num_folds = 20
seed = 42
# BUG FIX: shuffle=True is required when random_state is given —
# sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# BUG FIX: this PCA section previously reused best_c/best_gamma/best_kernel
# from the raw-data grid search; the PCA-space search results are the
# parameters "found before" for this pipeline.
model = SVC(C=best_c_pca, gamma=best_gamma_pca, kernel=best_kernel_pca)
model.fit(x_train_pca,y_train)
results = cross_val_score(model, x_test_pca, y_test, cv=kfold)
print(results)
print("max accuracy:", results.max())
print("min accuracy:", results.min())
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Final summary tables of model accuracies.
# NOTE(review): the accuracy figures (98.86, 93.69, 93.94, ...) are
# transcribed manually from earlier runs rather than captured
# programmatically — they will drift if the notebook is re-run with
# different data or parameters; verify before publishing.
# NOTE(review): the "Default parameters" rows display the *tuned*
# gamma/C/kernel values, which is misleading — confirm intent.
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "Data", "Accuracy", "gamma", "c", "kernel"]
x.add_row(["SVM", 'Raw', 98.86,best_gamma, best_c, best_kernel])
x.add_row(["SVM", 'PCA', 93.69,best_gamma_pca, best_c_pca, best_kernel_pca])
print(x)
x = PrettyTable()
x.field_names = ["Model", "Data", "Accuracy", "gamma", "c", "kernel"]
x.add_row(["SVM with 10 fold", 'Raw - Default parameters of SVM', 93.94, best_gamma, best_c, best_kernel])
x.add_row(["SVM with 10 fold", 'Raw - Best parameters of SVM', 95.81,1,1,'rbf'])
x.add_row(["SVM with 10 fold", 'PCA - Default parameters of SVM', 91.07, best_gamma_pca, best_c_pca, best_kernel_pca])
x.add_row(["SVM with 10 fold", 'PCA - Best parameters of SVM', 91.75,1,1,'rbf'])
print(x)